In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import matplotlib
import seaborn as sns
import scipy


from IPython.display import HTML

Assessing the relative impacts of constraints and calibrations on dating accuracy


In [2]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')


Out[2]:
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.

Analysis of 1400 MCMC runs on simulated data.

The simulation was performed on 10 different trees, which were used to generate alignments. For each tree, we gathered up to 5 calibration points (0, 1, 3, or 5) and up to 50 constraints (0, 1, 3, 5, 10, 20, or 50). We ran the MCMC for each combination of conditions (= 28 combinations), in 5 replicates, changing the nature of the calibrations and constraints each time. So in total we have $10~trees \times 28~conditions \times 5~replicates = 1400~points$.
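The condition grid can be sketched explicitly. The level sets below are read off the experimental description and the `describe()` summary further down; a minimal sketch:

```python
import itertools

# Factor levels of the simulation study (as described above)
trees = range(1, 11)                    # 10 simulated trees
calib_levels = [0, 1, 3, 5]             # numbers of calibration points
cons_levels = [0, 1, 3, 5, 10, 20, 50]  # numbers of constraints
replicates = range(1, 6)                # 5 replicates per condition

# One MCMC run per (tree, #calibrations, #constraints, replicate)
runs = list(itertools.product(trees, calib_levels, cons_levels, replicates))
print(len(runs))  # 4 * 7 = 28 conditions per tree, 1400 runs in total
```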

Plot of the 10 chronograms and the 10 trees with altered branch lengths


In [3]:
from ete3 import Tree, TreeStyle, NodeStyle

def readTreeFromFile(file):
    """Read a Newick tree from a file, joining it into a single line."""
    try:
        with open(file, 'r') as f:
            line = "".join(l.strip() for l in f)
    except IOError:
        print("Unknown file: " + file)
        sys.exit(1)
    return Tree(line)

# chronograms:
files = ["AnalysisBEFORE06052019/FirstAttempt/extantTree_%d_rescaled.dnd" % i for i in range(1, 11)]

ts = TreeStyle()
ts.min_leaf_separation = 10
ts.show_leaf_name = False
nstyle = NodeStyle()
nstyle["size"] = 0

thickness = 1
vt_line_width = 2
hz_line_width = 2

ts.scale = thickness * 2400  # pixels per branch length unit
ts.show_scale = False

trees = list()
for i, f in enumerate(files):
    t = readTreeFromFile(f)
    trees.append(t)
    for n in t.traverse():
        n.set_style(nstyle)
    t.render("extantTree" + str(i + 1) + ".png", tree_style=ts)

    
    
# rescaled trees:
files = ["AnalysisBEFORE06052019/FirstAttempt/extantTree_%d_rescaled_altered.dnd" % i for i in range(1, 11)]

ts = TreeStyle()
ts.min_leaf_separation = 10
ts.show_leaf_name = False
nstyle = NodeStyle()
nstyle["size"] = 0

thickness = 1
vt_line_width = 2
hz_line_width = 2

ts.scale = thickness * 2400  # pixels per branch length unit
#ts.show_scale = False

trees = list()
for i, f in enumerate(files):
    t = readTreeFromFile(f)
    trees.append(t)
    for n in t.traverse():
        n.set_style(nstyle)
    t.render("extantTree" + str(i + 1) + "_rescaled_altered.png", tree_style=ts)

Reading the data


In [4]:
colNames = ["treeId","numCalib","numCons","numReplicate","correlation", "rmsd", "correlation_bl", "rmsd_bl", "numNodes","numInHPD","fracInHPD","percent0","percent25","percent50","percent75","percent100"]
d = pd.read_csv ("resultAllTrees.txt", sep="\t", header=None, names=colNames)

In [5]:
d.describe()


Out[5]:
treeId numCalib numCons numReplicate correlation rmsd correlation_bl rmsd_bl numNodes numInHPD fracInHPD percent0 percent25 percent50 percent75 percent100
count 1400.000000 1400.000000 1400.000000 1400.000000 1400.000000 1400.000000 1400.000000 1400.000000 1400.0 1400.000000 1400.000000 1390.000000 1390.000000 1390.000000 1390.000000 1390.000000
mean 5.500000 2.250000 12.714286 3.000000 0.896741 3.004010 0.671940 2.484593 29.0 16.212857 55.906404 0.922015 2.166397 3.452197 5.118951 7.323804
std 2.873308 1.920973 16.495840 1.414719 0.084683 2.324495 0.174765 1.350331 0.0 5.885042 20.293247 0.766357 1.168117 1.664224 2.480621 3.634441
min 1.000000 0.000000 0.000000 1.000000 0.499728 0.520120 -0.134308 0.403570 29.0 0.000000 0.000000 0.000010 0.180254 0.206052 0.206052 0.206052
25% 3.000000 0.750000 1.000000 2.000000 0.859657 1.674116 0.573120 1.744306 29.0 12.000000 41.379310 0.523212 1.349880 2.332442 3.466733 4.909215
50% 5.500000 2.000000 5.000000 3.000000 0.921131 2.226946 0.704012 2.139378 29.0 17.000000 58.620690 0.791527 1.994972 3.202436 4.579952 6.444696
75% 8.000000 3.500000 20.000000 4.000000 0.958706 3.262030 0.805392 2.694706 29.0 20.000000 68.965517 1.193830 2.783481 4.466631 6.438647 9.037210
max 10.000000 5.000000 50.000000 5.000000 0.996401 15.680831 0.988294 10.224202 29.0 28.000000 96.551724 11.319780 11.462858 11.870524 15.111232 22.906924
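As a sanity check on the columns, `fracInHPD` looks like `numInHPD / numNodes` expressed as a percentage (the means above agree: 16.21 / 29 * 100 ≈ 55.9). A minimal sketch on toy values (illustrative, not taken from the study):

```python
import pandas as pd

# Toy rows mimicking the relevant columns; values are made up
toy = pd.DataFrame({"numInHPD": [16, 29, 0], "numNodes": [29, 29, 29]})
toy["fracInHPD"] = 100 * toy["numInHPD"] / toy["numNodes"]
print(toy["fracInHPD"].round(2).tolist())  # [55.17, 100.0, 0.0]
```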

Analysis of the impact of constraints

In the following 6 plots, we investigate the impact of constraints, not controlling for the number of calibrations.


In [6]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCons"], y=d["correlation_bl"], inner=None)
ax = sns.swarmplot(x=d["numCons"], y=d["correlation_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Correlation in branch lengths')


Out[6]:
[Text(0,0.5,'Correlation in branch lengths'),
 Text(0.5,0,'Number of constraints')]

In [7]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCons"], y=d["rmsd_bl"], inner=None)
ax = sns.swarmplot(x=d["numCons"], y=d["rmsd_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='RMSD in branch lengths')


Out[7]:
[Text(0,0.5,'RMSD in branch lengths'), Text(0.5,0,'Number of constraints')]

In [8]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCons"], y=d["rmsd"], inner=None)
ax = sns.swarmplot(x=d["numCons"], y=d["rmsd"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='RMSD in node ages')


Out[8]:
[Text(0,0.5,'RMSD in node ages'), Text(0.5,0,'Number of constraints')]

In [9]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCons"], y=d["correlation"], inner=None)
ax = sns.swarmplot(x=d["numCons"], y=d["correlation"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Correlation with true dates')


Out[9]:
[Text(0,0.5,'Correlation with true dates'),
 Text(0.5,0,'Number of constraints')]

In [10]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCons"], y=d["fracInHPD"], inner=None)
ax = sns.swarmplot(x=d["numCons"], y=d["fracInHPD"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Fraction of dates in 95% HPD')


Out[10]:
[Text(0,0.5,'Fraction of dates in 95% HPD'),
 Text(0.5,0,'Number of constraints')]

In [11]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCons"], y=d["percent50"], inner=None)
ax = sns.swarmplot(x=d["numCons"], y=d["percent50"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Median size of the 95% HPD')


Out[11]:
[Text(0,0.5,'Median size of the 95% HPD'), Text(0.5,0,'Number of constraints')]

Partial conclusion on the above

As the number of constraints increases from 0 to 50, the accuracy of the reconstruction improves slightly, even as the 95% HPD intervals shrink. Using all available constraints, we usually get both better point estimates and smaller 95% HPD intervals.
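One way to put a number on "improves slightly" is to average each metric per constraint level; a minimal sketch with made-up values (only the column names match the study):

```python
import pandas as pd

# Made-up data: two replicates per constraint level
toy = pd.DataFrame({
    "numCons":     [0, 0, 5, 5, 50, 50],
    "correlation": [0.80, 0.84, 0.88, 0.90, 0.93, 0.95],
})
means = toy.groupby("numCons")["correlation"].mean()
print(means.round(2).tolist())  # [0.82, 0.89, 0.94]
```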

Analysis of the impact of calibrations

In the following 6 plots, we investigate the impact of calibrations, not controlling for the number of constraints.


In [12]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCalib"], y=d["rmsd_bl"], inner=None)
ax = sns.swarmplot(x=d["numCalib"], y=d["rmsd_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='RMSD in branch lengths')


Out[12]:
[Text(0,0.5,'RMSD in branch lengths'), Text(0.5,0,'Number of calibrations')]

In [13]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCalib"], y=d["correlation_bl"], inner=None)
ax = sns.swarmplot(x=d["numCalib"], y=d["correlation_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Correlation in branch lengths')


Out[13]:
[Text(0,0.5,'Correlation in branch lengths'),
 Text(0.5,0,'Number of calibrations')]

In [14]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCalib"], y=d["rmsd"], inner=None)
ax = sns.swarmplot(x=d["numCalib"], y=d["rmsd"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='RMSD in node ages')


Out[14]:
[Text(0,0.5,'RMSD in node ages'), Text(0.5,0,'Number of calibrations')]

In [15]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCalib"], y=d["correlation"], inner=None)
ax = sns.swarmplot(x=d["numCalib"], y=d["correlation"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Correlation with true dates')


Out[15]:
[Text(0,0.5,'Correlation with true dates'),
 Text(0.5,0,'Number of calibrations')]

In [16]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCalib"], y=d["fracInHPD"], inner=None)
ax = sns.swarmplot(x=d["numCalib"], y=d["fracInHPD"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Fraction in 95% HPD')


Out[16]:
[Text(0,0.5,'Fraction in 95% HPD'), Text(0.5,0,'Number of calibrations')]

In [17]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["numCalib"], y=d["percent50"], inner=None)
ax = sns.swarmplot(x=d["numCalib"], y=d["percent50"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Median size of the 95% HPD')


Out[17]:
[Text(0,0.5,'Median size of the 95% HPD'),
 Text(0.5,0,'Number of calibrations')]

Partial conclusion on the above

As the number of calibrations increases from 0 to 5, the accuracy of the reconstruction improves, even as the 95% HPD intervals shrink. The effect of calibrations appears stronger than that of constraints.

Let's look at constraints, controlling for calibrations

In the following 10 plots, we separate the impacts of constraints and calibrations.
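The grouped bar plots below are essentially pivot tables of a metric over (calibrations × constraints); a minimal sketch with made-up values (only the column names match the study):

```python
import pandas as pd

# Made-up data; only the column names match the study
toy = pd.DataFrame({
    "numCalib":    [0, 0, 5, 5],
    "numCons":     [0, 50, 0, 50],
    "correlation": [0.80, 0.90, 0.88, 0.95],
})
pivot = toy.pivot_table(index="numCalib", columns="numCons",
                        values="correlation", aggfunc="mean")
print(pivot)
```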


In [18]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd_bl", x="numCalib", data=d)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in branch lengths')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[18]:
<matplotlib.legend.Legend at 0x7f0c3c455b00>

In [19]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation_bl", x="numCalib", data=d)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in branch lengths')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[19]:
<matplotlib.legend.Legend at 0x7f0c47c831d0>

In [20]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd", x="numCalib", data=d)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in node ages')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[20]:
<matplotlib.legend.Legend at 0x7f0c3c01c748>

In [21]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation", x="numCalib", data=d)
ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in node ages')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[21]:
<matplotlib.legend.Legend at 0x7f0c3c064080>

In [22]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="fracInHPD", x="numCalib", data=d)
ax.set_ylim(50, 90)
ax.set(xlabel='Number of calibrations', ylabel='Average fraction in 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[22]:
<matplotlib.legend.Legend at 0x7f0c316a09b0>

In [23]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent0", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average smallest 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[23]:
<matplotlib.legend.Legend at 0x7f0c316a0cf8>

In [24]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent25", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average 1st quartile 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[24]:
<matplotlib.legend.Legend at 0x7f0c314f6128>

In [25]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent50", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average median 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[25]:
<matplotlib.legend.Legend at 0x7f0c3c397048>

In [26]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent75", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average 3rd quartile 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[26]:
<matplotlib.legend.Legend at 0x7f0c312f7cc0>

In [27]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent100", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average maximum 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints", "20 constraints","50 constraints"])


Out[27]:
<matplotlib.legend.Legend at 0x7f0c311cf160>

Partial conclusion on the above

Constraints and calibrations both improve accuracy. The number of constraints does not seem to have much of an effect on the size of the 95% HPD, contrary to calibrations, unless many constraints are used (20 or 50). However, with large numbers of constraints, results improve markedly, although with a lot of variance when there are no calibrations. It is possible that on some trees convergence was difficult in the fully constrained runs.

Comparison between trees

In the following 6 plots, we look at the impact of the 10 different trees.

First let's order the trees from good to bad according to the correlation in node ages when all constraints are used:


In [28]:
# NOTE: the maximum constraint level in this dataset is 50, not 100,
# so this filter returns an empty frame
d_sel = d.loc[d['numCons'] == 100]
print(d_sel.sort_values(['correlation'], ascending=False))


Empty DataFrame
Columns: [treeId, numCalib, numCons, numReplicate, correlation, rmsd, correlation_bl, rmsd_bl, numNodes, numInHPD, fracInHPD, percent0, percent25, percent50, percent75, percent100]
Index: []
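Since the filter on 100 above returns an empty frame (the largest constraint level in this dataset is 50), the intended ranking could instead be computed by filtering on 50 and averaging the correlation per tree. A minimal sketch on made-up data:

```python
import pandas as pd

# Made-up data; only the column names match the study
toy = pd.DataFrame({
    "treeId":      [1, 1, 2, 2, 3, 3],
    "numCons":     [50, 0, 50, 50, 50, 50],
    "correlation": [0.90, 0.50, 0.70, 0.80, 0.95, 0.97],
})
sel = toy.loc[toy["numCons"] == 50]
ranking = sel.groupby("treeId")["correlation"].mean().sort_values(ascending=False)
print(ranking.index.tolist())  # trees ordered from best to worst: [3, 1, 2]
```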

In [29]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["treeId"], y=d["rmsd"], inner=None)
ax = sns.swarmplot(x=d["treeId"], y=d["rmsd"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Rmsd in node ages')


Out[29]:
[Text(0,0.5,'Rmsd in node ages'), Text(0.5,0,'Tree ID')]

In [30]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["treeId"], y=d["correlation"], inner=None)
ax = sns.swarmplot(x=d["treeId"], y=d["correlation"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Correlation in node ages')


Out[30]:
[Text(0,0.5,'Correlation in node ages'), Text(0.5,0,'Tree ID')]

In [31]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["treeId"], y=d["rmsd_bl"], inner=None)
ax = sns.swarmplot(x=d["treeId"], y=d["rmsd_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Rmsd in branch lengths')


Out[31]:
[Text(0,0.5,'Rmsd in branch lengths'), Text(0.5,0,'Tree ID')]

In [32]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["treeId"], y=d["correlation_bl"], inner=None)
ax = sns.swarmplot(x=d["treeId"], y=d["correlation_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Correlation in branch lengths')


Out[32]:
[Text(0,0.5,'Correlation in branch lengths'), Text(0.5,0,'Tree ID')]

In [33]:
%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(x=d["treeId"], y=d["fracInHPD"], inner=None)
ax = sns.swarmplot(x=d["treeId"], y=d["fracInHPD"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Fraction in 95% HPD')


Out[33]:
[Text(0,0.5,'Fraction in 95% HPD'), Text(0.5,0,'Tree ID')]

In [34]:
%matplotlib inline
ax = sns.barplot(hue="numReplicate", y="fracInHPD", x="treeId", data=d)
ax.set(xlabel='Tree ID', ylabel='Average fraction in 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["Replicate 1", "Replicate 2", "Replicate 3", "Replicate 4", "Replicate 5"])


Out[34]:
<matplotlib.legend.Legend at 0x7f0c30f8b748>

Partial conclusion on the above

Some trees are consistently easy and some consistently hard, even when we use all constraints. Ordering the trees from easiest to hardest, I'd say: 3, 8, 1, 5, 4, 7, 9, 10, 2, 6.

Results obtained on easy trees 3 and 8


In [35]:
d_easy = d.loc[d['treeId'].isin([3,8])]
d_easy.describe()


Out[35]:
treeId numCalib numCons numReplicate correlation rmsd correlation_bl rmsd_bl numNodes numInHPD fracInHPD percent0 percent25 percent50 percent75 percent100
count 280.000000 280.000000 280.000000 280.000000 280.000000 280.000000 280.000000 280.000000 280.0 280.000000 280.000000 277.000000 277.000000 277.000000 277.000000 277.000000
mean 5.500000 2.250000 12.714286 3.000000 0.925598 1.829125 0.588282 1.819418 29.0 15.707143 54.162562 1.012165 1.832303 2.658059 3.549981 4.659655
std 2.504476 1.923725 16.519473 1.416746 0.056042 0.684613 0.243125 0.576184 0.0 7.051268 24.314717 0.988119 1.066427 1.282957 1.441752 1.528877
min 3.000000 0.000000 0.000000 1.000000 0.578225 0.591677 -0.134308 0.587738 29.0 0.000000 0.000000 0.131346 0.206052 0.206052 0.206052 0.206052
25% 3.000000 0.750000 1.000000 2.000000 0.906250 1.409188 0.411883 1.511924 29.0 11.000000 37.931034 0.552695 1.270310 1.858943 2.680850 3.864000
50% 5.500000 2.000000 5.000000 3.000000 0.933050 1.814342 0.623940 1.867186 29.0 16.000000 55.172414 0.827379 1.766160 2.660796 3.568374 4.847240
75% 8.000000 3.500000 20.000000 4.000000 0.960114 2.199241 0.781380 2.156142 29.0 20.000000 68.965517 1.312498 2.313160 3.276017 4.469970 5.620991
max 8.000000 5.000000 50.000000 5.000000 0.993490 5.060508 0.953065 4.140936 29.0 28.000000 96.551724 11.319780 11.319780 11.319780 11.319780 11.319780

In [36]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd_bl", x="numCalib", data=d_easy)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in branch lengths, easy trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[36]:
<matplotlib.legend.Legend at 0x7f0c30d5ac88>

In [37]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation_bl", x="numCalib", data=d_easy)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in branch lengths, easy trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[37]:
<matplotlib.legend.Legend at 0x7f0c44e6df98>

In [38]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd", x="numCalib", data=d_easy)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in node ages, easy trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[38]:
<matplotlib.legend.Legend at 0x7f0c30add3c8>

In [39]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation", x="numCalib", data=d_easy)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in node ages, easy trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[39]:
<matplotlib.legend.Legend at 0x7f0c309db2e8>

Results obtained on hard trees 2 and 10

I decided against tree 6, where the variance across runs is too wide.


In [40]:
d_hard = d.loc[d['treeId'].isin([2,10])]
d_hard.describe()


Out[40]:
treeId numCalib numCons numReplicate correlation rmsd correlation_bl rmsd_bl numNodes numInHPD fracInHPD percent0 percent25 percent50 percent75 percent100
count 280.000000 280.000000 280.000000 280.000000 280.000000 280.000000 280.000000 280.000000 280.0 280.000000 280.000000 278.000000 278.000000 278.000000 278.000000 278.000000
mean 6.000000 2.250000 12.714286 3.000000 0.907384 2.887404 0.663989 2.433143 29.0 16.417857 56.613300 0.851531 2.468896 4.094352 6.173579 8.942895
std 4.007162 1.923725 16.519473 1.416746 0.077800 1.503842 0.150929 0.777563 0.0 5.394179 18.600618 0.749418 1.512983 2.148340 3.005151 3.858153
min 2.000000 0.000000 0.000000 1.000000 0.612940 0.621928 0.249167 1.269837 29.0 0.000000 0.000000 0.034629 0.202552 0.233199 0.256224 0.271627
25% 2.000000 0.750000 1.000000 2.000000 0.865381 1.930992 0.545123 1.947593 29.0 13.000000 44.827586 0.517698 1.260794 2.576794 4.031428 5.979279
50% 6.000000 2.000000 5.000000 3.000000 0.926691 2.527060 0.673071 2.345736 29.0 17.000000 58.620690 0.756845 2.075388 3.379728 5.263088 8.121213
75% 10.000000 3.500000 20.000000 4.000000 0.971860 3.479149 0.802864 2.760133 29.0 20.000000 68.965517 1.088855 3.693723 5.774506 8.545029 11.858223
max 10.000000 5.000000 50.000000 5.000000 0.994280 9.388842 0.917169 7.006532 29.0 27.000000 93.103448 11.055193 11.462858 11.870524 15.111232 20.556620

In [41]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd_bl", x="numCalib", data=d_hard)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in branch lengths, hard trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[41]:
<matplotlib.legend.Legend at 0x7f0c309115c0>

In [42]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation_bl", x="numCalib", data=d_hard)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in branch lengths, hard trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[42]:
<matplotlib.legend.Legend at 0x7f0c309cb5c0>

In [43]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd", x="numCalib", data=d_hard)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in node ages, hard trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[43]:
<matplotlib.legend.Legend at 0x7f0c3094c320>

In [44]:
%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation", x="numCalib", data=d_hard)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in node ages, hard trees')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints","10 constraints","20 constraints","50 constraints" ])


Out[44]:
<matplotlib.legend.Legend at 0x7f0c30628d30>

Partial conclusion on the above

Constraints seem to help a little more on the easy trees than on the hard trees, but the effect is not large. With all constraints, we see a clear improvement in both cases, but more variance on the two hard trees.

Variation among replicates, controlling for a given set of conditions, focusing on constraints

For a given number of constraints, number of calibrations, and tree, we want to assess how variable the results are across replicates, focusing on the impact of constraints (whose number is color-coded).


In [45]:
cors = pd.DataFrame({'var': d.groupby(["treeId", "numCalib", "numCons"])["correlation"].std()}).reset_index()
# Drop the case with 100 constraints, if present (this dataset tops out at 50)
cors = cors[cors["numCons"] != 100]
cors = cors.reset_index(drop=True)

In [46]:
%matplotlib inline

# Select the color map named rainbow
cmap = plt.cm.get_cmap(name='tab10')


fig, ax = plt.subplots(figsize=(20, 10))
markers = "ov^<>8sp*h"  # marker shape changes every 24 points
for i in range(240):
    mark = markers[i // 24]
    ax.plot(i, cors["var"][i], mark, color=cmap(i % 6))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 constraint'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 constraint'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(4), label='20 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(5), label='50 constraints'))



ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in correlation", fontsize=15)


Out[46]:
Text(0,0.5,'Standard deviation in correlation')

In [47]:
%matplotlib inline

rmsd=pd.DataFrame({'var' : d.groupby( ["treeId","numCalib","numCons"] )["rmsd"].std()}).reset_index()

# Let's drop the case with 100 constraints
rmsd = rmsd[rmsd["numCons"] != 100]
rmsd = rmsd.reset_index(drop=True)
rmsd.head(130)


# Select the color map
cmap = plt.cm.get_cmap(name='tab10')

fig, ax = plt.subplots(figsize=(20, 10))
markers = "ov^<>8sp*h"  # marker shape changes every 24 points
for i in range(240):
    mark = markers[i // 24]
    ax.plot(i, rmsd["var"][i], mark, color=cmap(i % 6))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 constraint'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 constraint'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(4), label='20 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(5), label='50 constraints'))


ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in node age rmsd", fontsize=15)


Out[47]:
Text(0,0.5,'Standard deviation in node age rmsd')

In [48]:
%matplotlib inline

fracInHPD=pd.DataFrame({'var' : d.groupby( ["treeId","numCalib","numCons"] )["fracInHPD"].std()}).reset_index()

# Let's drop the case with 100 constraints
fracInHPD = fracInHPD[fracInHPD["numCons"] != 100]
fracInHPD = fracInHPD.reset_index(drop=True)
fracInHPD.head(130)


# Select the color map
cmap = plt.cm.get_cmap(name='tab10')

fig, ax = plt.subplots(figsize=(20, 10))
markers = "ov^<>8sp*h"  # marker shape changes every 24 points
for i in range(240):
    mark = markers[i // 24]
    ax.plot(i, fracInHPD["var"][i], mark, color=cmap(i % 6))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 constraint'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(4), label='20 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(5), label='50 constraints'))


ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in fraction of nodes in 95% HPD", fontsize=15)


Out[48]:
Text(0,0.5,'Standard deviation in fraction of nodes in 95% HPD')

In [49]:
%matplotlib inline

percent50=pd.DataFrame({'var' : d.groupby( ["treeId","numCalib","numCons"] )["percent50"].std()}).reset_index()

# Let's drop the case with 100 constraints, as in the cells above
percent50 = percent50[percent50["numCons"] != 100]
percent50 = percent50.reset_index(drop=True)


# Select the color map
cmap = plt.cm.get_cmap(name='tab10')

fig, ax = plt.subplots(figsize=(20, 10))
# One marker per tree, one color per number of constraints
# (the color index must cycle over the constraint levels, not i % 4)
markers = ["o", "v", "^", "<", ">", "8", "s", "p", "*", "h"]
consLevels = [0, 1, 3, 5, 10, 20, 50]
for i in range(len(percent50)):
    mark = markers[int(percent50["treeId"][i]) - 1]
    ax.plot(i, percent50["var"][i], mark,
            color=cmap(consLevels.index(percent50["numCons"][i])))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 constraint'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(4), label='10 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(5), label='20 constraints'))
patches.append(matplotlib.patches.Patch(color=cmap(6), label='50 constraints'))


ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in median 95% HPD size", fontsize=15)


Out[49]:
Text(0,0.5,'Standard deviation in median 95% HPD size')

Partial conclusion on the above

It is hard to say much about the graphs above, except that even when controlling for a given number of constraints on a given tree, there can be variation in the accuracy and in the size of the 95% HPD intervals. This is expected, given that the constraints can be positioned on different nodes of the tree.
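Rather than scanning hundreds of points marker by marker, one way to summarize these spreads is to pool the per-condition standard deviations by number of constraints and draw a boxplot. A minimal sketch on made-up spread values: `groups` here is hypothetical, the real input would be the `var` column of the grouped dataframes above, split by `numCons`.

```python
import numpy as np
import matplotlib
matplotlib.use("Agg")  # render off-screen; harmless when run as a script
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
consLevels = [0, 1, 3, 5, 10, 20, 50]
# Hypothetical per-condition standard deviations: one value per
# (tree, numCalib) combination for each number of constraints
groups = [np.abs(rng.normal(0.05, 0.03, size=40)) for _ in consLevels]

fig, ax = plt.subplots(figsize=(10, 5))
bp = ax.boxplot(groups)
ax.set_xticks(range(1, len(consLevels) + 1), [str(n) for n in consLevels])
ax.set_xlabel("Number of constraints", fontsize=15)
ax.set_ylabel("Std. dev. of correlation across replicates", fontsize=15)
```

This collapses the 280-point scatter into one box per constraint level, which makes any trend in replicate-to-replicate variability easier to spot.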

Variation among replicates, controlling for a given set of conditions, focusing on calibrations

For a given tree and a given number of constraints and calibrations, we want to assess how variable the results are across replicates, focusing this time on the impact of calibrations (whose number is color-coded).


In [50]:
# Group analyses by tree, and recover correlation only
cors=pd.DataFrame({'var' : d.groupby( ["treeId","numCons", "numCalib"] )["correlation"].std()}).reset_index()

# Let's drop the case with 100 constraints
cors = cors[cors["numCons"] != 100]
cors = cors.reset_index(drop=True)
cors.head(130)


Out[50]:
treeId numCons numCalib var
0 1 0 0 0.076899
1 1 0 1 0.040677
2 1 0 3 0.099492
3 1 0 5 0.022494
4 1 1 0 0.054938
5 1 1 1 0.110788
6 1 1 3 0.019939
7 1 1 5 0.029835
8 1 3 0 0.036859
9 1 3 1 0.059481
10 1 3 3 0.023403
11 1 3 5 0.022189
12 1 5 0 0.083978
13 1 5 1 0.034038
14 1 5 3 0.031221
15 1 5 5 0.034348
16 1 10 0 0.018320
17 1 10 1 0.017281
18 1 10 3 0.088846
19 1 10 5 0.018175
20 1 20 0 0.105158
21 1 20 1 0.195866
22 1 20 3 0.012141
23 1 20 5 0.088245
24 1 50 0 0.011758
25 1 50 1 0.208232
26 1 50 3 0.009524
27 1 50 5 0.008664
28 2 0 0 0.026899
29 2 0 1 0.042165
... ... ... ... ...
100 4 10 0 0.036134
101 4 10 1 0.071339
102 4 10 3 0.029556
103 4 10 5 0.085925
104 4 20 0 0.021639
105 4 20 1 0.019184
106 4 20 3 0.039281
107 4 20 5 0.054794
108 4 50 0 0.005852
109 4 50 1 0.016160
110 4 50 3 0.012278
111 4 50 5 0.007079
112 5 0 0 0.067950
113 5 0 1 0.044067
114 5 0 3 0.003846
115 5 0 5 0.008510
116 5 1 0 0.049216
117 5 1 1 0.006538
118 5 1 3 0.024210
119 5 1 5 0.011739
120 5 3 0 0.053329
121 5 3 1 0.053058
122 5 3 3 0.021492
123 5 3 5 0.016559
124 5 5 0 0.049587
125 5 5 1 0.017728
126 5 5 3 0.016339
127 5 5 5 0.017027
128 5 10 0 0.069434
129 5 10 1 0.018955

130 rows × 4 columns


In [51]:
%matplotlib inline


# Select the color map named tab10
cmap = plt.cm.get_cmap(name='tab10')


fig, ax = plt.subplots(figsize=(20, 10))
# One marker per tree, one color per number of calibrations
markers = ["o", "v", "^", "<", ">", "8", "s", "p", "*", "h"]
calibLevels = [0, 1, 3, 5]
for i in range(len(cors)):
    mark = markers[int(cors["treeId"][i]) - 1]
    ax.plot(i, cors["var"][i], mark,
            color=cmap(calibLevels.index(cors["numCalib"][i])))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 calibration'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 calibrations'))


ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in correlation", fontsize=15)


Out[51]:
Text(0,0.5,'Standard deviation in correlation')

In [52]:
%matplotlib inline

rmsd=pd.DataFrame({'var' : d.groupby( ["treeId","numCons", "numCalib"] )["rmsd"].std()}).reset_index()


# Let's drop the case with 100 constraints
rmsd = rmsd[rmsd["numCons"] != 100]
rmsd = rmsd.reset_index(drop=True)
rmsd.head(130)

fig, ax = plt.subplots(figsize=(20, 10))
# One marker per tree, one color per number of calibrations
markers = ["o", "v", "^", "<", ">", "8", "s", "p", "*", "h"]
calibLevels = [0, 1, 3, 5]
for i in range(len(rmsd)):
    mark = markers[int(rmsd["treeId"][i]) - 1]
    ax.plot(i, rmsd["var"][i], mark,
            color=cmap(calibLevels.index(rmsd["numCalib"][i])))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 calibration'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 calibrations'))


ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in node age rmsd", fontsize=15)


Out[52]:
Text(0,0.5,'Standard deviation in node age rmsd')

In [53]:
%matplotlib inline

fracInHPD=pd.DataFrame({'var' : d.groupby( ["treeId","numCons", "numCalib"] )["fracInHPD"].std()}).reset_index()


# Let's drop the case with 100 constraints
fracInHPD = fracInHPD[fracInHPD["numCons"] != 100]
fracInHPD = fracInHPD.reset_index(drop=True)
fracInHPD.head(130)

fig, ax = plt.subplots(figsize=(20, 10))
# One marker per tree, one color per number of calibrations
markers = ["o", "v", "^", "<", ">", "8", "s", "p", "*", "h"]
calibLevels = [0, 1, 3, 5]
for i in range(len(fracInHPD)):
    mark = markers[int(fracInHPD["treeId"][i]) - 1]
    ax.plot(i, fracInHPD["var"][i], mark,
            color=cmap(calibLevels.index(fracInHPD["numCalib"][i])))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 calibration'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 calibrations'))


ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in fraction of nodes in 95% HPD", fontsize=15)


Out[53]:
Text(0,0.5,'Standard deviation in fraction of nodes in 95% HPD')

In [54]:
%matplotlib inline

percent50=pd.DataFrame({'var' : d.groupby( ["treeId","numCons", "numCalib"] )["percent50"].std()}).reset_index()

# Let's drop the case with 100 constraints
percent50 = percent50[percent50["numCons"] != 100]
percent50 = percent50.reset_index(drop=True)
percent50.head(130)

fig, ax = plt.subplots(figsize=(20, 10))
# One marker per tree, one color per number of calibrations
markers = ["o", "v", "^", "<", ">", "8", "s", "p", "*", "h"]
calibLevels = [0, 1, 3, 5]
for i in range(len(percent50)):
    mark = markers[int(percent50["treeId"][i]) - 1]
    ax.plot(i, percent50["var"][i], mark,
            color=cmap(calibLevels.index(percent50["numCalib"][i])))

patches=list()
patches.append(matplotlib.patches.Patch(color=cmap(0), label='0 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(1), label='1 calibration'))
patches.append(matplotlib.patches.Patch(color=cmap(2), label='3 calibrations'))
patches.append(matplotlib.patches.Patch(color=cmap(3), label='5 calibrations'))


ax.legend(handles=patches)

plt.xlabel("Condition", fontsize=15)
plt.ylabel("Standard deviation in median 95% HPD size", fontsize=15)


Out[54]:
Text(0,0.5,'Standard deviation in median 95% HPD size')

Partial conclusion on the above

As with the analyses focusing on the number of constraints, some conditions show noticeable variation among replicates, even with 5 calibrations.
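To go beyond eyeballing, one could ask how much of the spread in a metric each factor accounts for, for instance by comparing the between-level variance of the factor means to the total variance. This is only a crude sketch on made-up data: `d_toy` and its effect sizes are hypothetical stand-ins for the notebook's dataframe `d`.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
rows = []
for n_calib in (0, 1, 3, 5):
    for n_cons in (0, 1, 3, 5, 10, 20, 50):
        for rep in range(5):
            # Made-up effect sizes: calibrations shrink the error more
            val = 2.0 - 0.2 * n_calib - 0.005 * n_cons \
                  + 0.1 * rng.standard_normal()
            rows.append({"numCalib": n_calib, "numCons": n_cons, "rmsd": val})
d_toy = pd.DataFrame(rows)

# Crude "variance explained": between-level variance of the factor means,
# relative to the total variance (population variances, equal group sizes)
total = d_toy["rmsd"].var(ddof=0)
explained = {f: d_toy.groupby(f)["rmsd"].mean().var(ddof=0) / total
             for f in ("numCalib", "numCons")}
print(explained)
```

With equal group sizes this is essentially a one-way eta-squared per factor; on the real `d`, it would give a single number per factor to compare against the scatter plots above.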

Analysis of variation among replicates, another visualization


In [ ]:

Analysis of mixing in a random replicate

Here we simply look at how well the MCMC seems to mix in one particular example, on tree 9, with varying numbers of constraints.
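Trace plots are an eyeball check; a more quantitative one is the effective sample size (ESS). Below is a minimal sketch (a truncated-autocorrelation estimator, not necessarily the one used by standard MCMC diagnostic tools), run on a synthetic autocorrelated trace standing in for the `Posterior` column.

```python
import numpy as np

def ess(trace):
    """Crude effective sample size: n / (1 + 2 * sum of the empirical
    autocorrelations, truncated at the first negative value)."""
    x = np.asarray(trace, dtype=float)
    x = x - x.mean()
    n = len(x)
    # Empirical autocovariance at lags 0..n-1, normalized to autocorrelation
    acov = np.correlate(x, x, mode="full")[n - 1:] / n
    acf = acov / acov[0]
    s = 0.0
    for rho in acf[1:]:
        if rho < 0:
            break
        s += rho
    return n / (1.0 + 2.0 * s)

# Synthetic AR(1) trace: strongly autocorrelated, like a slowly mixing chain
rng = np.random.default_rng(42)
n = 4000
trace = np.empty(n)
trace[0] = 0.0
for t in range(1, n):
    trace[t] = 0.9 * trace[t - 1] + rng.standard_normal()
print(ess(trace))  # far fewer effective samples than the 4000 raw ones
```

An ESS much smaller than the number of recorded iterations would indicate strong autocorrelation, and hence worse mixing than the trace plots alone might suggest.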


In [55]:
d0=pd.read_csv ("output/9_0_0_1.log", sep="\t")
d1=pd.read_csv ("output/9_0_1_1.log", sep="\t")
d3=pd.read_csv ("output/9_0_3_1.log", sep="\t")
d5=pd.read_csv ("output/9_0_5_1.log", sep="\t")
d10=pd.read_csv ("output/9_0_10_1.log", sep="\t")
d20=pd.read_csv ("output/9_0_20_1.log", sep="\t")
d50=pd.read_csv ("output/9_0_50_1.log", sep="\t")

d1.describe()


Out[55]:
Iteration Posterior Likelihood Prior birth_rate branch_rates[1] branch_rates[2] branch_rates[3] branch_rates[4] branch_rates[5] ... tmrca_clade_10 tmrca_clade_2 tmrca_clade_3 tmrca_clade_4 tmrca_clade_5 tmrca_clade_6 tmrca_clade_7 tmrca_clade_8 tmrca_clade_9 turnover
count 100001.000000 100001.000000 100001.000000 100001.000000 100001.000000 1.000010e+05 100001.000000 1.000010e+05 100001.000000 100001.000000 ... 100001.000000 100001.000000 100001.000000 100001.000000 100001.000000 1.000010e+05 100001.000000 100001.000000 100001.000000 100001.000000
mean 500000.000000 -10606.521077 -10674.452340 67.931307 0.298130 4.320533e-03 0.013005 6.499372e-03 0.004998 0.008406 ... 2.694008 4.635063 1.703691 15.992084 9.919549 9.677923e+00 2.866605 9.152657 14.037070 0.099351
std 288679.464718 44.376214 12.853636 45.267599 0.035513 7.610898e-03 0.020168 1.218161e-02 0.005799 0.008591 ... 1.857087 4.252231 1.755678 0.700995 0.359766 1.807274e-11 1.810737 0.059576 0.006944 0.088035
min 0.000000 -10808.300000 -10737.600000 -128.926000 0.265445 6.290130e-09 0.000017 2.442280e-08 0.000026 0.000072 ... 0.061079 1.271515 0.176993 14.183960 6.884904 9.677923e+00 0.129404 8.978263 13.678110 0.000003
25% 250000.000000 -10636.300000 -10682.300000 38.080900 0.274163 7.910900e-04 0.003432 1.084890e-03 0.001810 0.003707 ... 1.332526 1.271515 0.578223 15.480860 9.962201 9.677923e+00 1.528596 9.173048 14.037630 0.031799
50% 500000.000000 -10605.900000 -10674.100000 70.867100 0.286870 2.060690e-03 0.007163 2.941340e-03 0.003250 0.005902 ... 2.259453 1.271515 0.737581 15.939180 9.962201 9.677923e+00 2.462521 9.173048 14.037630 0.074688
75% 750000.000000 -10575.000000 -10666.800000 100.680000 0.309593 4.823230e-03 0.014768 7.114810e-03 0.005978 0.009933 ... 3.565025 8.309437 2.536261 16.471400 9.962201 9.677923e+00 3.789436 9.173048 14.037630 0.142602
max 1000000.000000 -10457.500000 -10627.600000 220.458000 0.818629 2.630060e-01 0.953776 5.115520e-01 0.156790 0.399855 ... 15.555700 17.120050 10.802100 18.660790 9.962201 9.677923e+00 14.472440 9.173048 14.037630 0.675745

8 rows × 91 columns


In [56]:
%matplotlib inline

fig, ax = plt.subplots(7, figsize=(20, 10))
p1, = ax[0].plot(d0['Iteration'], d0['Posterior'], 'b-')
p2, = ax[1].plot(d1['Iteration'], d1['Posterior'], 'b-')
p3, = ax[2].plot(d3['Iteration'], d3['Posterior'], 'b-')
p4, = ax[3].plot(d5['Iteration'], d5['Posterior'], 'b-')
p5, = ax[4].plot(d10['Iteration'], d10['Posterior'], 'b-')
p6, = ax[5].plot(d20['Iteration'], d20['Posterior'], 'b-')
p7, = ax[6].plot(d50['Iteration'], d50['Posterior'], 'b-')


ax[0].legend([p1], ['0 constraints'], loc='lower right')
ax[1].legend([p2], ['1 constraint'], loc='lower right')
ax[2].legend([p3], ['3 constraints'], loc='lower right')
ax[3].legend([p4], ['5 constraints'], loc='lower right')
ax[4].legend([p5], ['10 constraints'], loc='lower right')
ax[5].legend([p6], ['20 constraints'], loc='lower right')
ax[6].legend([p7], ['50 constraints'], loc='lower right')


plt.xlabel("Iteration", fontsize=15)
plt.ylabel("Posterior", fontsize=15);


Partial conclusion on the above

From this quick look, the mixing does not seem too problematic.

Conclusion

From these analyses, I would say:

  • Mixing of the MCMC does not seem too worrisome.
  • Calibrations help a lot.
  • Constraints help too, but probably a bit less. In particular, they do less to shrink the 95% HPD intervals: they tend to improve node age correlations, but help less with branch lengths and RMSD.
  • Nonetheless, using a large number of constraints, or all of them, does produce good chronograms according to the metrics used here.
  • Focusing on easy trees or on hard trees does not change the picture much, although constraints may help a little more on easier trees. But since constraints have their greatest impact on node age correlation, and those correlations are already very good on easy trees, it is not clear that running more easy simulations would help characterize the effect of constraints.

In [ ]: